# Classifying Users
import csv
import sqlite3
from pathlib import Path

import pandas as pd

# Raw-data location. NOTE(review): this hard-coded absolute path raised a
# FileNotFoundError on a previous run — confirm the file actually lives here.
RAW_CSV = Path('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns.csv')
CLEAN_CSV = Path('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv')

# Fail early with a clear message instead of a deep pandas traceback.
if not RAW_CSV.exists():
    raise FileNotFoundError(f"Input CSV not found: {RAW_CSV}")

df = pd.read_csv(RAW_CSV)
# Drop every row containing a missing value (small fraction of the data).
df.dropna(inplace=True)
# BUG FIX: index=False — the original call also wrote the DataFrame index,
# which would reappear as an extra 'Unnamed: 0' column on the next read_csv.
df.to_csv(CLEAN_CSV, index=False)
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
Cell In[2], line 2
1 import pandas as pd
----> 2 df=pd.read_csv('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns.csv')
3 df.dropna(inplace=True)
4 df.to_csv('/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv')
File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)
1013 kwds_defaults = _refine_defaults_read(
1014 dialect,
1015 delimiter,
(...)
1022 dtype_backend=dtype_backend,
1023 )
1024 kwds.update(kwds_defaults)
-> 1026 return _read(filepath_or_buffer, kwds)
File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620, in _read(filepath_or_buffer, kwds)
617 _validate_names(kwds.get("names", None))
619 # Create the parser.
--> 620 parser = TextFileReader(filepath_or_buffer, **kwds)
622 if chunksize or iterator:
623 return parser
File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620, in TextFileReader.__init__(self, f, engine, **kwds)
1617 self.options["has_index_names"] = kwds["has_index_names"]
1619 self.handles: IOHandles | None = None
-> 1620 self._engine = self._make_engine(f, self.engine)
File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880, in TextFileReader._make_engine(self, f, engine)
1878 if "b" not in mode:
1879 mode += "b"
-> 1880 self.handles = get_handle(
1881 f,
1882 mode,
1883 encoding=self.options.get("encoding", None),
1884 compression=self.options.get("compression", None),
1885 memory_map=self.options.get("memory_map", False),
1886 is_text=is_text,
1887 errors=self.options.get("encoding_errors", "strict"),
1888 storage_options=self.options.get("storage_options", None),
1889 )
1890 assert self.handles is not None
1891 f = self.handles.handle
File /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pandas/io/common.py:873, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
868 elif isinstance(handle, str):
869 # Check whether the filename is to be opened in binary mode.
870 # Binary mode does not support 'encoding' and 'newline'.
871 if ioargs.encoding and "b" not in ioargs.mode:
872 # Encoding
--> 873 handle = open(
874 handle,
875 ioargs.mode,
876 encoding=ioargs.encoding,
877 errors=errors,
878 newline="",
879 )
880 else:
881 # Binary mode
882 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: '/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns.csv'
# Path to the cleaned CSV produced in the previous step.
data_path = '/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv'

# BUG FIX: the original connected to 'ev_chargingdb' (missing the '.db'
# extension), silently creating a second, empty database file that the rest
# of the notebook — which consistently uses 'ev_charging.db' — never touched.
conn = sqlite3.connect('ev_charging.db')
ev_cursor = conn.cursor()
Step-1 Creating Normalized Database
# -------------------------------------------------------------------------
# Step 1: build the normalized schema inside ev_charging.db.
# -------------------------------------------------------------------------
conn = sqlite3.connect('ev_charging.db')
ev_cursor = conn.cursor()

# Wipe any leftovers from previous runs so the CREATE statements start clean.
for table_name in ("Users", "ChargingStations", "ChargingSessions", "EnvironmentalData"):
    ev_cursor.execute(f"DROP TABLE IF EXISTS {table_name};")
conn.commit()

# (DDL, confirmation message) pairs, ordered so that referenced tables are
# created before the tables that declare FOREIGN KEYs against them.
schema = [
    (
        """
CREATE TABLE Users (
user_id VARCHAR(50) PRIMARY KEY,
user_type VARCHAR(50),
vehicle_model VARCHAR(50),
vehicle_age_years FLOAT
);
""",
        "Users table created successfully.",
    ),
    (
        """
CREATE TABLE ChargingStations (
station_id VARCHAR(50) PRIMARY KEY,
station_location VARCHAR(100),
charger_type VARCHAR(50)
);
""",
        "ChargingStations table created successfully.",
    ),
    (
        """
CREATE TABLE ChargingSessions (
session_id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id VARCHAR(50),
station_id VARCHAR(50),
start_time TIMESTAMP,
end_time TIMESTAMP,
duration_hours FLOAT,
energy_consumed_kwh FLOAT,
charging_cost_usd FLOAT,
charging_rate_kw FLOAT,
soc_start_percent FLOAT,
soc_end_percent FLOAT,
time_of_day VARCHAR(50),
day_of_week VARCHAR(50),
FOREIGN KEY(user_id) REFERENCES Users(user_id),
FOREIGN KEY(station_id) REFERENCES ChargingStations(station_id)
);
""",
        "ChargingSessions table created successfully.",
    ),
    (
        """
CREATE TABLE EnvironmentalData (
session_id INTEGER,
distance_driven_km FLOAT,
temperature_c FLOAT,
battery_capacity_kwh FLOAT,
PRIMARY KEY(session_id),
FOREIGN KEY(session_id) REFERENCES ChargingSessions(session_id)
);
""",
        "EnvironmentalData table created successfully.",
    ),
]

# Create each table in turn, committing and confirming as we go.
for ddl, confirmation in schema:
    ev_cursor.execute(ddl)
    conn.commit()
    print(confirmation)

# All four tables now exist; release the connection.
conn.close()
Users table created successfully.
ChargingStations table created successfully.
ChargingSessions table created successfully.
EnvironmentalData table created successfully.
import csv
import sqlite3

# --- Load the cleaned CSV into the normalized tables ----------------------
conn = sqlite3.connect('ev_charging.db')
# BUG FIX: SQLite ignores FOREIGN KEY clauses unless this per-connection
# pragma is enabled; without it the REFERENCES constraints declared in the
# schema were never actually enforced during these inserts.
conn.execute("PRAGMA foreign_keys = ON;")
ev_cursor = conn.cursor()

# CSV path (output of the cleaning step).
data_path = '/Users/saisatvikhlakkimsetty/Downloads/ev_charging_patterns_null_removed.csv'

# INSERT OR IGNORE: users and stations repeat across sessions, so rows that
# would duplicate an existing primary key are silently skipped.
insert_user_query = """
INSERT OR IGNORE INTO Users (user_id, user_type, vehicle_model, vehicle_age_years)
VALUES (?, ?, ?, ?);
"""
insert_station_query = """
INSERT OR IGNORE INTO ChargingStations (station_id, station_location, charger_type)
VALUES (?, ?, ?);
"""
insert_sessions_query = """
INSERT INTO ChargingSessions (user_id, station_id, start_time, end_time, duration_hours,
energy_consumed_kwh, charging_cost_usd, charging_rate_kw,
soc_start_percent, soc_end_percent, time_of_day, day_of_week)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
"""
insert_env_data_query = """
INSERT INTO EnvironmentalData (session_id, distance_driven_km, temperature_c, battery_capacity_kwh)
VALUES (?, ?, ?, ?);
"""

# BUG FIX: the csv module documents that files must be opened with
# newline=''; utf-8 is pinned so the 'Temperature (°C)' header decodes
# correctly regardless of the platform's default encoding.
with open(data_path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        try:
            # Insert user data (ignored if the user already exists).
            ev_cursor.execute(insert_user_query, (
                row['User ID'],
                row['User Type'],
                row['Vehicle Model'],
                float(row['Vehicle Age (years)'])
            ))
            # Insert station data (ignored if the station already exists).
            ev_cursor.execute(insert_station_query, (
                row['Charging Station ID'],
                row['Charging Station Location'],
                row['Charger Type']
            ))
            # Insert the session itself.
            ev_cursor.execute(insert_sessions_query, (
                row['User ID'],
                row['Charging Station ID'],
                row['Charging Start Time'],
                row['Charging End Time'],
                float(row['Charging Duration (hours)']),
                float(row['Energy Consumed (kWh)']),
                float(row['Charging Cost (USD)']),
                float(row['Charging Rate (kW)']),
                float(row['State of Charge (Start %)']),
                float(row['State of Charge (End %)']),
                row['Time of Day'],
                row['Day of Week']
            ))
            # AUTOINCREMENT session_id generated by the insert above; it
            # links the 1:1 EnvironmentalData row to its session.
            session_id = ev_cursor.lastrowid
            ev_cursor.execute(insert_env_data_query, (
                session_id,
                float(row['Distance Driven (since last charge) (km)']),
                float(row['Temperature (°C)']),
                float(row['Battery Capacity (kWh)'])
            ))
            # Per-row commit is slow but kept deliberately: one bad row
            # must not discard the rows already loaded.
            conn.commit()
        except sqlite3.IntegrityError as e:
            print(f"Integrity error for User ID {row['User ID']}: {e}")
            conn.rollback()
        except Exception as e:
            print(f"Error for User ID {row['User ID']}: {e}")
            conn.rollback()

# Close connection
conn.close()
import pandas as pd
import sqlite3

# Re-open the database and denormalize it back into one wide row per
# charging session by joining all four tables.
select_data_from_tables = """
SELECT
u.user_id,
u.user_type,
u.vehicle_model,
u.vehicle_age_years,
cs.station_id,
cs.station_location,
cs.charger_type,
s.start_time,
s.end_time,
s.duration_hours,
s.energy_consumed_kwh,
s.charging_cost_usd,
s.charging_rate_kw,
s.soc_start_percent,
s.soc_end_percent,
s.time_of_day,
s.day_of_week,
e.distance_driven_km,
e.temperature_c,
e.battery_capacity_kwh
FROM Users u
JOIN ChargingSessions s ON u.user_id = s.user_id
JOIN ChargingStations cs ON s.station_id = cs.station_id
JOIN EnvironmentalData e ON s.session_id = e.session_id;
"""

# Run the query, making sure the connection is released even on failure.
conn = sqlite3.connect('ev_charging.db')
try:
    df = pd.read_sql_query(select_data_from_tables, conn)
finally:
    conn.close()

# Show the combined result.
print(df)
user_id user_type vehicle_model vehicle_age_years \
0 User_1 Commuter BMW i3 2.0
1 User_2 Casual Driver Hyundai Kona 3.0
2 User_3 Commuter Chevy Bolt 2.0
3 User_4 Long-Distance Traveler Hyundai Kona 1.0
4 User_5 Long-Distance Traveler Hyundai Kona 1.0
... ... ... ... ...
1126 User_1316 Commuter Nissan Leaf 7.0
1127 User_1317 Casual Driver BMW i3 4.0
1128 User_1318 Commuter Nissan Leaf 5.0
1129 User_1319 Commuter Chevy Bolt 5.0
1130 User_1320 Commuter Nissan Leaf 5.0
station_id station_location charger_type start_time \
0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00
1 Station_428 San Francisco Level 1 2024-01-01 01:00:00
2 Station_181 San Francisco Level 2 2024-01-01 02:00:00
3 Station_327 Houston Level 1 2024-01-01 03:00:00
4 Station_108 Los Angeles Level 1 2024-01-01 04:00:00
... ... ... ... ...
1126 Station_57 New York Level 2 2024-02-24 19:00:00
1127 Station_40 New York Level 1 2024-02-24 20:00:00
1128 Station_374 New York DC Fast Charger 2024-02-24 21:00:00
1129 Station_336 Chicago DC Fast Charger 2024-02-24 22:00:00
1130 Station_128 San Francisco Level 1 2024-02-24 23:00:00
end_time duration_hours energy_consumed_kwh \
0 2024-01-01 00:39:00 0.591363 60.712346
1 2024-01-01 03:01:00 3.133652 12.339275
2 2024-01-01 04:48:00 2.452653 19.128876
3 2024-01-01 06:42:00 1.266431 79.457824
4 2024-01-01 05:46:00 2.019765 19.629104
... ... ... ...
1126 2024-02-24 20:30:00 1.426444 42.011654
1127 2024-02-24 20:44:00 3.238212 68.185853
1128 2024-02-24 23:03:00 3.267122 18.895102
1129 2024-02-24 23:20:00 2.754527 13.756252
1130 2024-02-24 23:56:00 3.740970 63.652570
charging_cost_usd charging_rate_kw soc_start_percent soc_end_percent \
0 13.087717 36.389181 29.371576 86.119962
1 21.128448 30.677735 10.115778 84.664344
2 35.667270 27.513593 6.854604 69.917615
3 13.036239 32.882870 83.120003 99.624328
4 10.161471 10.215712 54.258950 63.743786
... ... ... ... ...
1126 22.081164 5.895475 39.204102 83.915952
1127 5.067806 18.388012 31.456375 93.096461
1128 37.255002 45.482066 71.903081 78.678879
1129 39.046146 38.148183 76.187997 65.926573
1130 10.863674 33.704226 59.338076 56.692439
time_of_day day_of_week distance_driven_km temperature_c \
0 Evening Tuesday 293.602111 27.947953
1 Morning Monday 112.112804 14.311026
2 Morning Thursday 71.799253 21.002002
3 Evening Saturday 199.577785 38.316313
4 Morning Saturday 203.661847 -7.834199
... ... ... ... ...
1126 Evening Sunday 239.601075 1.919655
1127 Evening Tuesday 164.376022 34.029775
1128 Evening Tuesday 226.519258 20.358761
1129 Afternoon Sunday 291.494076 24.134598
1130 Evening Monday 14.449236 -6.966593
battery_capacity_kwh
0 108.463007
1 100.000000
2 75.000000
3 50.000000
4 50.000000
... ...
1126 100.000000
1127 100.000000
1128 100.000000
1129 85.000000
1130 120.447195
[1131 rows x 20 columns]
Verifying the data retrieved from the database
# Preview the first five joined rows pulled back from SQLite.
df.head()
| user_id | user_type | vehicle_model | vehicle_age_years | station_id | station_location | charger_type | start_time | end_time | duration_hours | energy_consumed_kwh | charging_cost_usd | charging_rate_kw | soc_start_percent | soc_end_percent | time_of_day | day_of_week | distance_driven_km | temperature_c | battery_capacity_kwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | User_1 | Commuter | BMW i3 | 2.0 | Station_391 | Houston | DC Fast Charger | 2024-01-01 00:00:00 | 2024-01-01 00:39:00 | 0.591363 | 60.712346 | 13.087717 | 36.389181 | 29.371576 | 86.119962 | Evening | Tuesday | 293.602111 | 27.947953 | 108.463007 |
| 1 | User_2 | Casual Driver | Hyundai Kona | 3.0 | Station_428 | San Francisco | Level 1 | 2024-01-01 01:00:00 | 2024-01-01 03:01:00 | 3.133652 | 12.339275 | 21.128448 | 30.677735 | 10.115778 | 84.664344 | Morning | Monday | 112.112804 | 14.311026 | 100.000000 |
| 2 | User_3 | Commuter | Chevy Bolt | 2.0 | Station_181 | San Francisco | Level 2 | 2024-01-01 02:00:00 | 2024-01-01 04:48:00 | 2.452653 | 19.128876 | 35.667270 | 27.513593 | 6.854604 | 69.917615 | Morning | Thursday | 71.799253 | 21.002002 | 75.000000 |
| 3 | User_4 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_327 | Houston | Level 1 | 2024-01-01 03:00:00 | 2024-01-01 06:42:00 | 1.266431 | 79.457824 | 13.036239 | 32.882870 | 83.120003 | 99.624328 | Evening | Saturday | 199.577785 | 38.316313 | 50.000000 |
| 4 | User_5 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_108 | Los Angeles | Level 1 | 2024-01-01 04:00:00 | 2024-01-01 05:46:00 | 2.019765 | 19.629104 | 10.161471 | 10.215712 | 54.258950 | 63.743786 | Morning | Saturday | 203.661847 | -7.834199 | 50.000000 |
# Drop any remaining rows with nulls, then confirm none are left.
df.dropna(inplace=True)
df.isna().sum()
user_id 0
user_type 0
vehicle_model 0
vehicle_age_years 0
station_id 0
station_location 0
charger_type 0
start_time 0
end_time 0
duration_hours 0
energy_consumed_kwh 0
charging_cost_usd 0
charging_rate_kw 0
soc_start_percent 0
soc_end_percent 0
time_of_day 0
day_of_week 0
distance_driven_km 0
temperature_c 0
battery_capacity_kwh 0
dtype: int64
%pip install scikit-learn
%pip install seaborn
Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (1.6.0)
Requirement already satisfied: numpy>=1.19.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (2.0.2)
Requirement already satisfied: scipy>=1.6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (3.5.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: seaborn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.13.2)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (2.0.2)
Requirement already satisfied: pandas>=1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (2.2.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)
Requirement already satisfied: pillow>=8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: six>=1.5 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Feature matrix / target: 'user_type' is the classification target.
# NOTE(review): X still carries identifier-like columns (user_id, station_id)
# and raw timestamp strings — worth excluding before modeling; verify.
X = df.drop(columns=['user_type'])
y = df['user_type']
# Initial 80/20 split; stratify=None is deliberate here — the next cell
# demonstrates the difference a stratified split makes.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=None, random_state=42
)
Stratifying the data by the target variable
# Compare the 'user_type' class balance before and after a stratified split:
# stratify=df['user_type'] forces train/test to mirror the full dataset.
user_type_distribution = df['user_type'].value_counts(normalize=True)
print("Original dataset 'User Type' distribution:")
print(user_type_distribution)

train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['user_type'],
    random_state=42,
)

# Per-partition proportions — both should closely match the original.
train_distribution = train_df['user_type'].value_counts(normalize=True)
print("\nTrain set 'User Type' distribution:")
print(train_distribution)

test_distribution = test_df['user_type'].value_counts(normalize=True)
print("\nTest set 'User Type' distribution:")
print(test_distribution)
Original dataset 'User Type' distribution:
user_type
Commuter 0.357206
Long-Distance Traveler 0.336870
Casual Driver 0.305924
Name: proportion, dtype: float64
Train set 'User Type' distribution:
user_type
Commuter 0.357301
Long-Distance Traveler 0.336283
Casual Driver 0.306416
Name: proportion, dtype: float64
Test set 'User Type' distribution:
user_type
Commuter 0.356828
Long-Distance Traveler 0.339207
Casual Driver 0.303965
Name: proportion, dtype: float64
%pip install ipywidgets
Requirement already satisfied: ipywidgets in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (8.1.5)
Requirement already satisfied: comm>=0.1.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (0.2.2)
Requirement already satisfied: ipython>=6.1.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (8.26.0)
Requirement already satisfied: traitlets>=4.3.1 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (5.14.3)
Requirement already satisfied: widgetsnbextension~=4.0.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ipywidgets) (4.0.13)
Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ipywidgets) (3.0.13)
Requirement already satisfied: decorator in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)
Requirement already satisfied: jedi>=0.16 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.19.1)
Requirement already satisfied: matplotlib-inline in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)
Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.47)
Requirement already satisfied: pygments>=2.4.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (2.18.0)
Requirement already satisfied: stack-data in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)
Requirement already satisfied: pexpect>4.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (4.9.0)
Requirement already satisfied: parso<0.9.0,>=0.8.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)
Requirement already satisfied: ptyprocess>=0.5 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)
Requirement already satisfied: wcwidth in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)
Requirement already satisfied: executing>=1.2.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.1)
Requirement already satisfied: asttokens>=2.1.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.4.1)
Requirement already satisfied: pure-eval in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.3)
Requirement already satisfied: six>=1.12.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
Exploratory Data Analysis
# Automated EDA via ydata-profiling: per-column stats, correlations,
# missing-value overview for the joined DataFrame.
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Data Profile Report", explorative=True)
# Persist the report as standalone HTML, then embed it in the notebook.
profile.to_file("ev_charging_patterns_profile_report.html")
profile.to_notebook_iframe()
""" Observations:
Missing Values The dataset contains 66 missing values, which accounts for approximately 0.3% of the data. Missing values are present in the following columns: ‘Energy Consumed (kWh)’,’Charging Rate (kW)’,’Distance Driven (since last charge) (km)’
Duplicates A thorough check reveals that there are no duplicate rows in the dataset, indicating data uniqueness.
Categorical Variables The dataset includes the following categorical variables: 'User Type': Represents the type of user, which is our target variable. 'Vehicle Model': Indicates the vehicle model being used. 'Charger Type': Specifies the type of charger employed during the session. 'Charging Station Location': Describes the location of the charging station. All these variables have low cardinality (each contains between 3 and 5 unique values), making them manageable for encoding or analysis.
Numerical Variables Some inconsistencies and unusual patterns were identified: ‘State of Charge (Start %)’ and ‘State of Charge (End %)’: Both columns have values exceeding 100%, which is logically inconsistent and requires correction or capping.
‘Temperature (°C)’: The recorded temperatures range from -10.72°C to 73.17°C, suggesting potential outliers that need further investigation.
Strong correlations observed: ‘Energy Consumed (kWh)’ and ‘Charging Duration (hours)’ have a strong positive correlation (0.95). ‘Battery Capacity (kWh)’ and ‘Charging Rate (kW)’ show a moderate positive correlation (0.68).
Date/Time Variables
The dataset includes two datetime variables: ‘Charging Start Time’,’Charging End Time’ These variables can be leveraged to derive new features like charging duration, time of day, or day of the week.
Distributions
Some numerical features show distinct patterns: ‘Charging Duration (hours)’ and ‘Charging Cost (USD)’: Both exhibit right-skewed distributions, indicating a few sessions with unusually high values.
"""
# Re-inspect the first rows after reviewing the profiling report.
df.head()
| user_id | user_type | vehicle_model | vehicle_age_years | station_id | station_location | charger_type | start_time | end_time | duration_hours | energy_consumed_kwh | charging_cost_usd | charging_rate_kw | soc_start_percent | soc_end_percent | time_of_day | day_of_week | distance_driven_km | temperature_c | battery_capacity_kwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | User_1 | Commuter | BMW i3 | 2.0 | Station_391 | Houston | DC Fast Charger | 2024-01-01 00:00:00 | 2024-01-01 00:39:00 | 0.591363 | 60.712346 | 13.087717 | 36.389181 | 29.371576 | 86.119962 | Evening | Tuesday | 293.602111 | 27.947953 | 108.463007 |
| 1 | User_2 | Casual Driver | Hyundai Kona | 3.0 | Station_428 | San Francisco | Level 1 | 2024-01-01 01:00:00 | 2024-01-01 03:01:00 | 3.133652 | 12.339275 | 21.128448 | 30.677735 | 10.115778 | 84.664344 | Morning | Monday | 112.112804 | 14.311026 | 100.000000 |
| 2 | User_3 | Commuter | Chevy Bolt | 2.0 | Station_181 | San Francisco | Level 2 | 2024-01-01 02:00:00 | 2024-01-01 04:48:00 | 2.452653 | 19.128876 | 35.667270 | 27.513593 | 6.854604 | 69.917615 | Morning | Thursday | 71.799253 | 21.002002 | 75.000000 |
| 3 | User_4 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_327 | Houston | Level 1 | 2024-01-01 03:00:00 | 2024-01-01 06:42:00 | 1.266431 | 79.457824 | 13.036239 | 32.882870 | 83.120003 | 99.624328 | Evening | Saturday | 199.577785 | 38.316313 | 50.000000 |
| 4 | User_5 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_108 | Los Angeles | Level 1 | 2024-01-01 04:00:00 | 2024-01-01 05:46:00 | 2.019765 | 19.629104 | 10.161471 | 10.215712 | 54.258950 | 63.743786 | Morning | Saturday | 203.661847 | -7.834199 | 50.000000 |
# Dtypes and non-null counts; start_time/end_time are still plain strings.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131 entries, 0 to 1130
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1131 non-null object
1 user_type 1131 non-null object
2 vehicle_model 1131 non-null object
3 vehicle_age_years 1131 non-null float64
4 station_id 1131 non-null object
5 station_location 1131 non-null object
6 charger_type 1131 non-null object
7 start_time 1131 non-null object
8 end_time 1131 non-null object
9 duration_hours 1131 non-null float64
10 energy_consumed_kwh 1131 non-null float64
11 charging_cost_usd 1131 non-null float64
12 charging_rate_kw 1131 non-null float64
13 soc_start_percent 1131 non-null float64
14 soc_end_percent 1131 non-null float64
15 time_of_day 1131 non-null object
16 day_of_week 1131 non-null object
17 distance_driven_km 1131 non-null float64
18 temperature_c 1131 non-null float64
19 battery_capacity_kwh 1131 non-null float64
dtypes: float64(10), object(10)
memory usage: 176.8+ KB
# Summary statistics; note the max SoC values exceed 100% — the logical
# inconsistency already flagged in the observations above.
df.describe()
| vehicle_age_years | duration_hours | energy_consumed_kwh | charging_cost_usd | charging_rate_kw | soc_start_percent | soc_end_percent | distance_driven_km | temperature_c | battery_capacity_kwh | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 |
| mean | 3.604227 | 2.303177 | 42.915668 | 22.488351 | 26.014166 | 49.230036 | 75.012917 | 153.663101 | 15.305780 | 74.427818 |
| std | 2.324090 | 1.065878 | 22.201286 | 10.792504 | 14.010292 | 24.170435 | 16.920463 | 85.549751 | 14.751266 | 20.828350 |
| min | 0.000000 | 0.095314 | 0.045772 | 0.307085 | 1.472549 | 2.325959 | 7.604224 | 1.899538 | -10.724770 | 1.536540 |
| 25% | 2.000000 | 1.425281 | 24.248936 | 13.133925 | 13.949809 | 27.661992 | 62.264460 | 80.954993 | 3.009498 | 62.000000 |
| 50% | 4.000000 | 2.312675 | 42.865611 | 21.828088 | 25.838488 | 48.947886 | 75.100944 | 152.257515 | 14.641853 | 75.000000 |
| 75% | 6.000000 | 3.145998 | 61.544055 | 31.675804 | 37.508677 | 69.783816 | 88.245070 | 225.469628 | 27.824244 | 85.000000 |
| max | 11.688592 | 7.635145 | 152.238758 | 69.407743 | 97.342255 | 125.087227 | 177.708666 | 398.364775 | 73.169588 | 193.003074 |
# Column dtypes only (object columns include the raw timestamp strings).
df.dtypes
user_id object
user_type object
vehicle_model object
vehicle_age_years float64
station_id object
station_location object
charger_type object
start_time object
end_time object
duration_hours float64
energy_consumed_kwh float64
charging_cost_usd float64
charging_rate_kw float64
soc_start_percent float64
soc_end_percent float64
time_of_day object
day_of_week object
distance_driven_km float64
temperature_c float64
battery_capacity_kwh float64
dtype: object
# Count fully duplicated rows (expected: 0).
df.duplicated().sum()
np.int64(0)
# Final per-column null check before modeling.
df.isna().sum()
user_id 0
user_type 0
vehicle_model 0
vehicle_age_years 0
station_id 0
station_location 0
charger_type 0
start_time 0
end_time 0
duration_hours 0
energy_consumed_kwh 0
charging_cost_usd 0
charging_rate_kw 0
soc_start_percent 0
soc_end_percent 0
time_of_day 0
day_of_week 0
distance_driven_km 0
temperature_c 0
battery_capacity_kwh 0
dtype: int64
# NOTE(review): redundant — df.dropna() earlier already removed every null,
# so this subset-restricted drop is a no-op here; kept for safety.
df.dropna(subset=['vehicle_model','duration_hours','end_time'], inplace=True)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Pie chart for the distribution of user types
plt.figure(figsize=(8, 5))
pie_plot = df['user_type'].value_counts().plot.pie(
autopct='%1.1f%%',
startangle=90,
colors=sns.color_palette('Set2'),
fontsize=12
)
plt.title('Distribution of User Types', fontsize=16)
plt.show()
%pip install -q dagshub mlflow
Note: you may need to restart the kernel to use updated packages.
Experiment 1
import dagshub
# Point MLflow tracking at the DagsHub-hosted repo (remote experiment store).
dagshub.init(repo_owner='saisatvikh', repo_name='final_repo', mlflow=True)
Accessing as saisatvikh
Initialized MLflow to track repo "saisatvikh/final_repo"
Repository saisatvikh/final_repo initialized!
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer,OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression,RidgeClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score
# NOTE(review): X and y are used here but only assigned in a later cell
# (X = df.drop(columns=['user_type']); y = df['user_type']) — this notebook
# relies on out-of-order cell execution; confirm the intended run order.
categorical_features = X.select_dtypes(include=['object']).columns
for col in categorical_features:
    # Force a uniform string dtype so OneHotEncoder never sees mixed types.
    X[col] = X[col].astype(str)
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
# Numeric columns: mean-impute, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Categorical columns: mode-impute, then one-hot encode; unseen categories
# at predict time become an all-zero row instead of raising.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])
# 70/30 split; no stratify=y, so class proportions may drift between splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# 10-fold macro-F1 CV on the training portion only.
cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=10, scoring='f1_macro')
print(f"Cross-validation F1-score (mean): {cv_results.mean():.4f}")
print(f"Cross-validation F1-score (std): {cv_results.std():.4f}")
Cross-validation F1-score (mean): 0.3656
Cross-validation F1-score (std): 0.0559
# Fit the full preprocessing + logistic-regression pipeline on the training split.
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', LogisticRegression(solver='liblinear'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', LogisticRegression(solver='liblinear'))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')SimpleImputer()
StandardScaler()
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object')SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
LogisticRegression(solver='liblinear')
y_pred = model_pipeline.predict(X_test)
# Macro F1 weights the three user-type classes equally, regardless of support.
logistic_f1_score = f1_score(y_test, y_pred, average='macro')
print(f"F1-score on test data: {logistic_f1_score:.4f}")
F1-score on test data: 0.3007
# Per-class confusion-matrix counts, vectorized over all classes at once.
# BUGFIX: the previous version labeled the row-residual (false negatives)
# as `tn` and the leftover (true negatives) as `fn` — labels were swapped.
cm = confusion_matrix(y_test, y_pred)
tp = cm.diagonal()              # true positives: diagonal entries
fn = cm.sum(axis=1) - tp        # false negatives: rest of each true-class row
fp = cm.sum(axis=0) - tp        # false positives: rest of each predicted-class column
tn = cm.sum() - (tp + fp + fn)  # true negatives: everything else
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
# Log the logistic-regression experiment (CV F1, test F1, class-0 confusion
# cells, and the fitted pipeline artifact) to the DagsHub MLflow server.
with mlflow.start_run(nested=True):
    # Log model and scaler as parameters
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_param("scaler", "StandardScaler")
    # Cross-validation to compute the mean and std of f1 score
    # (cv=3 here vs cv=10 in the earlier standalone CV cell)
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())
    # Train the model
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    # Compute F1 score (macro average)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)
    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    # Choose a single class (e.g., class 0)
    class_index = 0  # Change this to select a different class if needed
    # Extract metrics for the selected class
    tp = cm[class_index, class_index]  # True Positive: Diagonal element for the class
    fn = cm[class_index].sum() - tp  # False Negative: Sum of row - TP
    fp = cm[:, class_index].sum() - tp  # False Positive: Sum of column - TP
    tn = cm.sum() - (tp + fn + fp)  # True Negative: Total sum - (TP + FN + FP)
    # Log the metrics for the selected class (e.g., class 0)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)
    # Log the model
    mlflow.sklearn.log_model(model_pipeline, "logistic_regression_model")
    # Print the logged metrics for the selected class
    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    # Optionally, print the confusion matrix values for the selected class
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/20 16:11:37 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3007
Logged CV results (mean): 0.3572
Logged CV results (std): 0.0186
Confusion Matrix:
[[18 47 45]
[25 51 32]
[32 53 37]]
True Positives (Class 0): 18
True Negatives (Class 0): 173
False Positives (Class 0): 57
False Negatives (Class 0): 92
🏃 View run colorful-asp-716 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/2473cb10a4e34cb0a266761edf1da0fa
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Ridge Classifier
# Second model: same preprocessing, swapping in a RidgeClassifier
# (linear model with L2-penalized least-squares on ±1 targets).
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RidgeClassifier())
])
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RidgeClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RidgeClassifier())])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')SimpleImputer()
StandardScaler()
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object')SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
RidgeClassifier()
y_pred = model_pipeline.predict(X_test)
# Macro-averaged F1 for the ridge classifier on the held-out 30% split.
ridge_f1_score = f1_score(y_test, y_pred, average='macro')
print(f"F1-score on test data: {ridge_f1_score:.4f}")
F1-score on test data: 0.3370
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
# Log the ridge-classifier experiment: 3-fold CV macro F1, test F1,
# class-0 confusion cells, and the fitted pipeline artifact.
with mlflow.start_run(nested=True):
    # Log model and scaler as parameters
    mlflow.log_param("model", "Ridge Classifier")
    mlflow.log_param("scaler", "StandardScaler")
    # 3-fold cross-validated macro F1 on the training split
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())
    # Refit on the full training split, then score the hold-out set
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)
    cm = confusion_matrix(y_test, y_pred)
    # Confusion-matrix cells for class 0 only
    class_index = 0
    tp = cm[class_index, class_index]
    fn = cm[class_index].sum() - tp  # row total minus diagonal
    fp = cm[:, class_index].sum() - tp  # column total minus diagonal
    tn = cm.sum() - (tp + fn + fp)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)
    # Persist the trained pipeline alongside the metrics
    mlflow.sklearn.log_model(model_pipeline, "ridge_model")
    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/20 16:11:54 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3370
Logged CV results (mean): 0.3621
Logged CV results (std): 0.0105
Confusion Matrix:
[[25 48 37]
[25 53 30]
[35 48 39]]
True Positives (Class 0): 25
True Negatives (Class 0): 170
False Positives (Class 0): 60
False Negatives (Class 0): 85
🏃 View run incongruous-stag-353 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/fe3cab3a15b940a2856d4f2211e829cd
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Random Forest Classifier
# Third model: random forest with default hyperparameters, reusing the
# same preprocessing ColumnTransformer.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RandomForestClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RandomForestClassifier())])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')SimpleImputer()
StandardScaler()
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object')SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
RandomForestClassifier()
y_pred = model_pipeline.predict(X_test)
# Macro-averaged F1 for the random forest on the hold-out split.
random_forest_f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1-score on test data: {random_forest_f1:.4f}")
F1-score on test data: 0.2968
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
# Log the random-forest experiment (CV F1, test F1, class-0 confusion cells,
# fitted pipeline). Note: the forest is unseeded, so logged scores can differ
# from the standalone test-F1 cell above (0.3060 vs 0.2968 in the outputs).
with mlflow.start_run(nested=True):
    # Log model and scaler as parameters
    mlflow.log_param("model", "Random Forest Classifier")
    mlflow.log_param("scaler", "StandardScaler")
    # Cross-validation to compute the mean and std of f1 score
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())
    # Train the model
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    # Compute F1 score (macro average)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)
    # Compute confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    # Choose a single class (e.g., class 0)
    class_index = 0  # Change this to select a different class if needed
    # Extract metrics for the selected class
    tp = cm[class_index, class_index]  # True Positive: Diagonal element for the class
    fn = cm[class_index].sum() - tp  # False Negative: Sum of row - TP
    fp = cm[:, class_index].sum() - tp  # False Positive: Sum of column - TP
    tn = cm.sum() - (tp + fn + fp)  # True Negative: Total sum - (TP + FN + FP)
    # Log the metrics for the selected class (e.g., class 0)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)
    # Log the model
    mlflow.sklearn.log_model(model_pipeline, "random_forest_model")
    # Print the logged metrics for the selected class
    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    # Optionally, print the confusion matrix values for the selected class
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/20 16:12:13 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3060
Logged CV results (mean): 0.3255
Logged CV results (std): 0.0264
Confusion Matrix:
[[11 70 29]
[11 78 19]
[15 77 30]]
True Positives (Class 0): 11
True Negatives (Class 0): 204
False Positives (Class 0): 26
False Negatives (Class 0): 99
🏃 View run powerful-hawk-261 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/58fd4a3298504e74b452b11a6b9e28de
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Polynomial Features
import mlflow
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
# Numeric columns fed to the degree-2 polynomial expansion below.
numeric_features = [
    'battery_capacity_kwh', 'charging_cost_usd',
    'duration_hours', 'soc_end_percent', 'soc_start_percent',
    'temperature_c', 'vehicle_age_years'
]
def add_polynomial_features(df, numeric_features, degree=2):
    """Append polynomial/interaction terms of the given numeric columns.

    Expands `df[numeric_features]` up to the given degree (no bias column)
    and concatenates the generated columns onto `df`, preserving the index.
    NOTE(review): the degree-1 terms are re-emitted under their original
    names, so those column names end up duplicated in the result.
    """
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    expanded = expander.fit_transform(df[numeric_features])
    expanded_frame = pd.DataFrame(
        expanded,
        columns=expander.get_feature_names_out(numeric_features),
        index=df.index,
    )
    return pd.concat([df, expanded_frame], axis=1)
# Expand df and log the experiment: parameters, the feature-count delta,
# and the widened dataset as a CSV artifact.
# NOTE(review): no mlflow.start_run() here — log_param auto-starts a run
# that stays open until the mlflow.end_run() in a later cell; confirm intent.
df_with_poly_features = add_polynomial_features(df, numeric_features, degree=2)
mlflow.log_param("polynomial_degree", 2)
mlflow.log_param("numeric_features", numeric_features)
mlflow.log_metric("num_poly_features", len(df_with_poly_features.columns) - len(df.columns))
df_with_poly_features.to_csv("polynomial_features_dataset.csv", index=False)
mlflow.log_artifact("polynomial_features_dataset.csv")
print(df_with_poly_features.head())
print("Feature engineering results logged in MLflow.")
user_id user_type vehicle_model vehicle_age_years \
0 User_1 Commuter BMW i3 2.0
1 User_2 Casual Driver Hyundai Kona 3.0
2 User_3 Commuter Chevy Bolt 2.0
3 User_4 Long-Distance Traveler Hyundai Kona 1.0
4 User_5 Long-Distance Traveler Hyundai Kona 1.0
station_id station_location charger_type start_time \
0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00
1 Station_428 San Francisco Level 1 2024-01-01 01:00:00
2 Station_181 San Francisco Level 2 2024-01-01 02:00:00
3 Station_327 Houston Level 1 2024-01-01 03:00:00
4 Station_108 Los Angeles Level 1 2024-01-01 04:00:00
end_time duration_hours ... soc_end_percent^2 \
0 2024-01-01 00:39:00 0.591363 ... 7416.647931
1 2024-01-01 03:01:00 3.133652 ... 7168.051082
2 2024-01-01 04:48:00 2.452653 ... 4888.472921
3 2024-01-01 06:42:00 1.266431 ... 9925.006656
4 2024-01-01 05:46:00 2.019765 ... 4063.270258
soc_end_percent soc_start_percent soc_end_percent temperature_c \
0 2529.479020 2406.876668
1 856.445674 1211.633594
2 479.257596 1468.409884
3 8280.774408 3817.236936
4 3458.670880 -499.381507
soc_end_percent vehicle_age_years soc_start_percent^2 \
0 172.239925 862.689475
1 253.993031 102.328957
2 139.835230 46.985602
3 99.624328 6908.934892
4 63.743786 2944.033622
soc_start_percent temperature_c soc_start_percent vehicle_age_years \
0 820.875427 58.743152
1 144.767153 30.347333
2 143.960415 13.709209
3 3184.852063 83.120003
4 -425.075411 54.258950
temperature_c^2 temperature_c vehicle_age_years vehicle_age_years^2
0 781.088080 55.895906 4.0
1 204.805455 42.933077 9.0
2 441.084081 42.004004 4.0
3 1468.139854 38.316313 1.0
4 61.374674 -7.834199 1.0
[5 rows x 56 columns]
Feature engineering results logged in MLflow.
Attribute Combinations
import mlflow
import pandas as pd
import numpy as np
def ensure_numeric_columns(df, columns):
    """Return `df` with the given columns coerced to numeric dtypes.

    Values that cannot be parsed become NaN (errors='coerce').

    FIX: operate on a copy so the caller's DataFrame is no longer mutated
    as a side effect; the call site reassigns the return value, so this is
    backward-compatible for the visible usage.
    """
    df = df.copy()
    for col in columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    return df
def add_attribute_combinations(df):
    """Derive combination features (ratios and differences) from existing columns."""
    combined = df.copy()
    source_columns = [
        'energy_consumed_kwh', 'duration_hours', 'distance_driven_km',
        'charging_cost_usd', 'soc_end_percent', 'soc_start_percent', 'temperature_c'
    ]
    # Coerce every source column to numeric (unparseable values -> NaN).
    combined = ensure_numeric_columns(combined, source_columns)
    energy = combined['energy_consumed_kwh']
    hours = combined['duration_hours']
    # Ratio / difference features.
    combined['energy_per_duration'] = energy / hours
    combined['distance_per_duration'] = combined['distance_driven_km'] / hours
    combined['charging_cost_per_kwh'] = combined['charging_cost_usd'] / energy
    combined['soc_diff'] = combined['soc_end_percent'] - combined['soc_start_percent']
    combined['temperature_adjusted_energy'] = energy / (1 + np.abs(combined['temperature_c']))
    # Zero out division artifacts (inf from zero denominators) and any NaNs.
    combined = combined.replace([np.inf, -np.inf], np.nan).fillna(0)
    return combined
# Add the features to the dataset
df_with_features = add_attribute_combinations(df)
# Log parameters for the feature combinations
mlflow.log_param("attribute_combination_features", [
    "energy_per_duration", "distance_per_duration", "charging_cost_per_kwh",
    "soc_diff", "temperature_adjusted_energy"
])
# Log metrics (e.g., number of new features created)
mlflow.log_metric("num_new_features", len(df_with_features.columns) - len(df.columns))
# Log the dataset with new features as an artifact
df_with_features.to_csv("attribute_combined_dataset.csv", index=False)
mlflow.log_artifact("attribute_combined_dataset.csv")
# Display the updated DataFrame
print(df_with_features.head())
# Close the MLflow run that the earlier log_param calls opened implicitly.
mlflow.end_run()
print("Feature combinations logged in MLflow.")
user_id user_type vehicle_model vehicle_age_years \
0 User_1 Commuter BMW i3 2.0
1 User_2 Casual Driver Hyundai Kona 3.0
2 User_3 Commuter Chevy Bolt 2.0
3 User_4 Long-Distance Traveler Hyundai Kona 1.0
4 User_5 Long-Distance Traveler Hyundai Kona 1.0
station_id station_location charger_type start_time \
0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00
1 Station_428 San Francisco Level 1 2024-01-01 01:00:00
2 Station_181 San Francisco Level 2 2024-01-01 02:00:00
3 Station_327 Houston Level 1 2024-01-01 03:00:00
4 Station_108 Los Angeles Level 1 2024-01-01 04:00:00
end_time duration_hours ... day_of_week distance_driven_km \
0 2024-01-01 00:39:00 0.591363 ... Tuesday 293.602111
1 2024-01-01 03:01:00 3.133652 ... Monday 112.112804
2 2024-01-01 04:48:00 2.452653 ... Thursday 71.799253
3 2024-01-01 06:42:00 1.266431 ... Saturday 199.577785
4 2024-01-01 05:46:00 2.019765 ... Saturday 203.661847
temperature_c battery_capacity_kwh charger_type_encoded \
0 27.947953 108.463007 0
1 14.311026 100.000000 1
2 21.002002 75.000000 2
3 38.316313 50.000000 1
4 -7.834199 50.000000 1
energy_per_duration distance_per_duration charging_cost_per_kwh soc_diff \
0 102.665033 496.483377 0.215569 56.748386
1 3.937666 35.777043 1.712292 74.548566
2 7.799260 29.274121 1.864577 63.063011
3 62.741544 157.590753 0.164065 16.504325
4 9.718509 100.834423 0.517674 9.484836
temperature_adjusted_energy
0 2.097293
1 0.805908
2 0.869415
3 2.020989
4 2.221945
[5 rows x 26 columns]
🏃 View run bald-smelt-249 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/f05524583bbc4ff599e401b6231dcf6c
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Feature combinations logged in MLflow.
Variance Threshold, Correlation Threshold, Feature Importance
import pandas as pd
import numpy as np
import mlflow
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Assuming df is your dataset and 'target' is the target column
target = 'user_type'
# Start MLflow run
# NOTE(review): opened without a `with` block; relies on mlflow.end_run()
# at the bottom of the cell — an exception in between leaves the run open.
mlflow.start_run(run_name="Feature_Selection_Experiment")
# Split dataset into features and target
X = df.drop(columns=[target])
y = df[target]
# Log parameters
mlflow.log_param("target_column", target)
# Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns.tolist()
# Log categorical and numerical features
mlflow.log_param("categorical_features", categorical_features)
mlflow.log_param("numeric_features", numeric_features)
# Preprocessing pipeline for categorical and numerical features
numeric_transformer = StandardScaler()
# Convert categorical columns to string to avoid mixed types
# (DataFrame.map applies element-wise; requires pandas >= 2.1)
X[categorical_features] = X[categorical_features].map(str)
# Categorical transformer for one-hot encoding with sparse=False (dense output)
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Experiment 1: Feature selection using Correlation Threshold
def correlation_threshold(X, threshold=0.9):
    """Drop one column of every pair whose absolute Pearson correlation
    exceeds `threshold`; return the surviving columns as a DataFrame.

    Accepts a DataFrame, a dense ndarray, or a scipy sparse matrix.
    """
    # Normalize the input to a DataFrame first.
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
    elif hasattr(X, 'toarray'):  # sparse matrix -> densify
        X = pd.DataFrame(X.toarray())
    abs_corr = X.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once.
    pair_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(pair_mask)
    redundant = [col for col in upper.columns if (upper[col] > threshold).any()]
    return X.drop(columns=redundant)
# Apply the preprocessor and correlation threshold
# (fit_transform returns a dense array because sparse_output=False above)
X_processed = preprocessor.fit_transform(X)
X_corr_selected = correlation_threshold(pd.DataFrame(X_processed), threshold=0.9)
# Log Correlation Threshold results
mlflow.log_param("correlation_threshold", 0.9)
mlflow.log_metric("num_features_after_correlation_threshold", X_corr_selected.shape[1])
# Experiment 2: Feature selection using Feature Importance (Random Forest)
def feature_importance(X, y):
    """Return X with columns reordered from most to least important,
    as ranked by a random forest's impurity-based importances.

    NOTE(review): columns are only reordered, never dropped, so the
    downstream "num_features_after_feature_importance" metric equals
    the input width.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)
    # Rename the local so it no longer shadows the function's own name.
    ranking = np.argsort(forest.feature_importances_)[::-1]
    return X.iloc[:, ranking]
# Apply the preprocessor and feature importance
# NOTE(review): refitting the preprocessor here recomputes the same matrix
# as X_processed above; the transform is repeated rather than reused.
X_fi_processed = preprocessor.fit_transform(X)
X_fi_selected = feature_importance(pd.DataFrame(X_fi_processed), y)
# Log Feature Importance results
mlflow.log_param("feature_importance_model", "Random Forest")
mlflow.log_metric("num_features_after_feature_importance", X_fi_selected.shape[1])
# Experiment 3: Feature selection using Variance Threshold
def variance_threshold(X, threshold=0.01):
    """Remove features whose training variance falls below the threshold.

    Returns a plain ndarray — column names are not preserved.
    """
    return VarianceThreshold(threshold=threshold).fit_transform(X)
# Apply the preprocessor and variance threshold
# NOTE(review): third identical refit of the preprocessor — see the note above.
X_var_processed = preprocessor.fit_transform(X)
X_var_selected = variance_threshold(pd.DataFrame(X_var_processed), threshold=0.01)
# Log Variance Threshold results
mlflow.log_param("variance_threshold", 0.01)
mlflow.log_metric("num_features_after_variance_threshold", X_var_selected.shape[1])
# Train and evaluate models to validate the selected features
def evaluate_model(X_selected, y):
    """Hold-out accuracy of a default 100-tree random forest (80/20 split)."""
    split = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    features_train, features_test, labels_train, labels_test = split
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(features_train, labels_train)
    return accuracy_score(labels_test, classifier.predict(features_test))
# Evaluate models after each feature selection method
accuracy_corr = evaluate_model(X_corr_selected, y)
accuracy_fi = evaluate_model(X_fi_selected, y)
accuracy_var = evaluate_model(X_var_selected, y)
# Log the model performance
mlflow.log_metric("accuracy_after_correlation_threshold", accuracy_corr)
mlflow.log_metric("accuracy_after_feature_importance", accuracy_fi)
mlflow.log_metric("accuracy_after_variance_threshold", accuracy_var)
# End the MLflow run
mlflow.end_run()
# Print summary
print(f"Accuracy after Correlation Threshold: {accuracy_corr:.4f}")
print(f"Accuracy after Feature Importance: {accuracy_fi:.4f}")
print(f"Accuracy after Variance Threshold: {accuracy_var:.4f}")
Principal Component Analysis
import pandas as pd
import numpy as np
import mlflow
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Assuming df is your dataset and 'user_type' is the target column
target = 'user_type'
# Start MLflow run (closed by mlflow.end_run() at the bottom of the cell)
mlflow.start_run(run_name="PCA_Dimensionality_Reduction")
# Split dataset into features and target
X = df.drop(columns=[target])
y = df[target]
# Log parameters
mlflow.log_param("target_column", target)
# Identify categorical and numerical columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns.tolist()
# Log categorical and numerical features
mlflow.log_param("categorical_features", categorical_features)
mlflow.log_param("numeric_features", numeric_features)
# Preprocessing pipeline for categorical and numerical features
numeric_transformer = StandardScaler()
# Convert categorical columns to string to avoid mixed types
# FIX: DataFrame.applymap is deprecated (the original run emitted a
# FutureWarning); DataFrame.map is the element-wise replacement, already
# used in the feature-selection cell above. Requires pandas >= 2.1.
X[categorical_features] = X[categorical_features].map(str)
# Categorical transformer for one-hot encoding with sparse_output=False (dense output)
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Apply preprocessing to the features
X_processed = preprocessor.fit_transform(X)
# Apply PCA for dimensionality reduction (full decomposition; pruned below)
pca = PCA()
X_pca = pca.fit_transform(X_processed)
# Log the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
mlflow.log_param("explained_variance_ratio", explained_variance_ratio.tolist())
# Create a scree plot to visualize the explained variance
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')
plt.title("Scree Plot: Explained Variance Ratio per Principal Component")
plt.xlabel("Principal Components")
plt.ylabel("Explained Variance Ratio")
plt.grid(True)
plt.tight_layout()
# Save the plot to a file
scree_plot_path = "scree_plot.png"
plt.savefig(scree_plot_path)
# Show the plot in the output
plt.show()
# Log the scree plot in MLFlow
mlflow.log_artifact(scree_plot_path)
# Determine how many components to keep based on cumulative explained variance
cumulative_variance = np.cumsum(explained_variance_ratio)
mlflow.log_param("cumulative_variance", cumulative_variance.tolist())
# Log the number of components based on the desired explained variance threshold
threshold = 0.95  # Choose the threshold for explained variance
# argmax finds the first component index reaching the cumulative threshold
num_components = np.argmax(cumulative_variance >= threshold) + 1
mlflow.log_param("num_components_selected", num_components)
# Apply PCA with the selected number of components
pca_selected = PCA(n_components=num_components)
X_pca_selected = pca_selected.fit_transform(X_processed)
# Log results for the reduced dataset
mlflow.log_param("num_features_before_pca", X_processed.shape[1])
mlflow.log_param("num_features_after_pca", X_pca_selected.shape[1])
# Train and evaluate a model on the PCA-reduced feature matrix.
def evaluate_model(X_selected, y):
    """Fit a RandomForest on an 80/20 split of (X_selected, y); return test accuracy.

    Both the split and the forest use random_state=42 so results are
    reproducible across runs.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42
    )
    classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)
    return accuracy_score(labels_test, predictions)
# Evaluate on the PCA-reduced features and close out the MLflow run.
accuracy_pca = evaluate_model(X_pca_selected, y)
# Log the model performance
mlflow.log_metric("accuracy_after_pca", accuracy_pca)
# End the MLflow run
mlflow.end_run()
# Print the results
print(f"Accuracy after PCA: {accuracy_pca:.4f}")
print(f"Number of components selected: {num_components}")
/var/folders/5q/38fn8x6x05j5m61tvnrj7t040000gn/T/ipykernel_5137/4008606804.py:38: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
X[categorical_features] = X[categorical_features].applymap(str)
🏃 View run PCA_Dimensionality_Reduction at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/197ae3b24a484273951a835c661a6069
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Accuracy after PCA: 0.2952
Number of components selected: 788
Custom Experiment 2
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
with mlflow.start_run(nested=True):
    # SVM experiment: cross-validate, fit, and log test metrics to MLflow.
    # NOTE(review): model_pipeline, X_train/X_test/y_train/y_test come from
    # earlier cells — confirm they are in scope before running this cell.
    mlflow.log_param("model", "SVM Classifier")
    mlflow.log_param("scaler", "StandardScaler")

    # 3-fold cross-validation on the training split, scored with macro-F1.
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())

    # Fit on the full training split and evaluate on the held-out test set.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    svm_f1_score = f1_score(y_test, y_pred, average='macro')
    # BUG FIX: the original logged (and printed below) an undefined name `f1`,
    # which raises NameError — the score is stored in svm_f1_score.
    mlflow.log_metric("f1_test", svm_f1_score)

    # Derive per-class TP/TN/FP/FN for class 0 from the confusion matrix.
    cm = confusion_matrix(y_test, y_pred)
    class_index = 0
    tp = cm[class_index, class_index]
    fn = cm[class_index].sum() - tp     # actual class 0, predicted otherwise
    fp = cm[:, class_index].sum() - tp  # predicted class 0, actually otherwise
    tn = cm.sum() - (tp + fn + fp)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    # Persist the fitted pipeline as an MLflow model artifact.
    mlflow.sklearn.log_model(model_pipeline, "svm_model")

    print(f"Logged F1 score to MLFlow: {svm_f1_score:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
Custom Experiment 3
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# --- Preprocess df, visualize with t-SNE, and set up the KNN grid search ---
# NOTE(review): assumes df is loaded by an earlier cell — confirm.

# Median-impute the numeric columns in place.
numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns
imputer = SimpleImputer(strategy="median")
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

# Standardize the numeric features into a separate frame.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_columns]), columns=numerical_columns)

# Integer-encode the charger type for use as the scatter-plot color.
label_encoder = LabelEncoder()
df['charger_type_encoded'] = label_encoder.fit_transform(df['charger_type'])

# Project the scaled features to 2-D with t-SNE for visualization.
tsne = TSNE(n_components=2, random_state=42)
df_tsne = tsne.fit_transform(df_scaled)
plt.figure(figsize=(8, 6))
plt.scatter(df_tsne[:, 0], df_tsne[:, 1], c=df['charger_type_encoded'], cmap='viridis', s=50, alpha=0.7)
plt.title('t-SNE visualization of the data')
# BUG FIX: points are colored by charger type, but the colorbar was labeled
# 'user_type'; label it after the quantity actually plotted.
plt.colorbar(label='charger_type')
plt.show()

# Train/validation split for the KNN classifier.
X = df_scaled
y = df['user_type']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid for KNN: neighborhood size and distance metric.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'metric': ['euclidean', 'manhattan'],
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
with mlflow.start_run():
    # Record the search configuration before fitting.
    mlflow.log_param("cv_folds", 5)
    mlflow.log_param("param_grid", param_grid)

    # Exhaustive grid search over the KNN hyperparameters.
    grid_search.fit(X_train, y_train)
    winning_params = grid_search.best_params_
    winning_model = grid_search.best_estimator_
    mlflow.log_params(winning_params)

    # Score the best estimator on the held-out validation split.
    val_predictions = winning_model.predict(X_val)
    conf_mat = confusion_matrix(y_val, val_predictions)
    summary = classification_report(y_val, val_predictions, output_dict=True)
    mlflow.log_metric("accuracy", summary["accuracy"])
    mlflow.log_metric("macro_avg_f1", summary["macro avg"]["f1-score"])
    mlflow.log_metric("weighted_avg_f1", summary["weighted avg"]["f1-score"])
    mlflow.log_metric("precision", summary["macro avg"]["precision"])
    mlflow.log_metric("recall", summary["macro avg"]["recall"])

    # Render the confusion matrix as a heatmap and attach it to the run.
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_mat, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(y), yticklabels=np.unique(y))
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")

    # Persist the tuned KNN model as an MLflow artifact.
    mlflow.sklearn.log_model(winning_model, "knn_model")

    print(f"Best Parameters: {winning_params}")
    print(f"Confusion Matrix:\n{conf_mat}")
    print(f"Classification Report:\n{classification_report(y_val, val_predictions)}")
F1-Scores Comparison
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
# Compare macro-F1 scores across the four model experiments in a bar chart.
# NOTE(review): logistic_f1_score, random_forest_f1, ridge_f1_score and
# svm_f1_score must already be defined by earlier cells — confirm.
model_names = ['Logistic Regression','Random Forest','Ridge Classifier','SVM']
f1_scores = [logistic_f1_score,random_forest_f1,ridge_f1_score,svm_f1_score]
f1_scores_df = pd.DataFrame({
    'Model': model_names,
    'F1-Score': f1_scores
})
# Sort descending so the best model appears at the top of the chart.
f1_df = f1_scores_df.sort_values(by='F1-Score', ascending=False)
plt.figure(figsize=(10, 6))
# FIX: seaborn deprecates passing `palette` without `hue` (FutureWarning,
# removal in v0.14); assigning hue to the y variable with legend=False
# produces the identical plot without the warning.
sns.barplot(x='F1-Score', y='Model', data=f1_df, hue='Model', palette='viridis', legend=False)
plt.title('Model Comparison Based on F1-Score', fontsize=16)
plt.xlabel('F1-Score', fontsize=14)
plt.ylabel('Model', fontsize=14)
plt.show()
/var/folders/5q/38fn8x6x05j5m61tvnrj7t040000gn/T/ipykernel_5137/261019361.py:17: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(x='F1-Score', y='Model', data=f1_df, palette='viridis')